Packages

library(tinytex)
library(tidyverse)
library(lubridate) 
# This one is for dates
# install.packages("DataExplorer")
library(DataExplorer)
library(meltr)
# install.packages("plotly")
library(plotly)
library(data.table)
# install.packages("installr") for updating R
library(installr)

The Data

Data <- readr::read_delim("C:/Users/Chris/OneDrive/R project/Customer Personality Analysis/marketing_campaign.csv")
## Rows: 2240 Columns: 29
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Option for future: use na.strings = "..": it replaces blanks with NA
# fileENcoding = "UTF-8-BOM" R reads characters as correctly as they would appear on the raw dataset

Data <- Data  %>% select(-ID, -Z_CostContact,-Response, -Z_Revenue, -Complain) %>%   filter(Income != 666666) %>%   mutate(Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y"))# This was added later
str(Data)
## tibble [2,215 x 24] (S3: tbl_df/tbl/data.frame)
##  $ Year_Birth         : num [1:2215] 1957 1954 1965 1984 1981 ...
##  $ Education          : chr [1:2215] "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr [1:2215] "Single" "Single" "Together" "Together" ...
##  $ Income             : num [1:2215] 58138 46344 71613 26646 58293 ...
##  $ Kidhome            : num [1:2215] 0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : num [1:2215] 0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : Date[1:2215], format: "2012-09-04" "2014-03-08" ...
##  $ Recency            : num [1:2215] 58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : num [1:2215] 635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : num [1:2215] 88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : num [1:2215] 546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : num [1:2215] 172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : num [1:2215] 88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : num [1:2215] 88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : num [1:2215] 3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : num [1:2215] 8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: num [1:2215] 10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : num [1:2215] 4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : num [1:2215] 7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : num [1:2215] 0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...

The Customer Personality Analysis dataset holds information on basic customer attributes such as birth year, education, income as well as their spending habits relating to wine, fruits meat, fish. Data also includes whether customers made purchases with a discount, through the web or with the help of a catalogue.

There are a couple of columns towards the end that I’m unclear on regarding their purpose. These are: Complain, Z_CostContact, Z_Revenue and Response. I think it is safe to remove them from the analysis. We also need to convert the date column into a date properly.

# Data <- Data %>% 
#  select(-Z_CostContact, - Response, -Z_Revenue, -Complain) %>% 
#  mutate(Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y")) Moved to the top

For the correlation matrix later we don’t need ID or customer joining date:

Data_cor <- Data %>% select_if(is.numeric)
str(Data)
## tibble [2,215 x 24] (S3: tbl_df/tbl/data.frame)
##  $ Year_Birth         : num [1:2215] 1957 1954 1965 1984 1981 ...
##  $ Education          : chr [1:2215] "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr [1:2215] "Single" "Single" "Together" "Together" ...
##  $ Income             : num [1:2215] 58138 46344 71613 26646 58293 ...
##  $ Kidhome            : num [1:2215] 0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : num [1:2215] 0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : Date[1:2215], format: "2012-09-04" "2014-03-08" ...
##  $ Recency            : num [1:2215] 58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : num [1:2215] 635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : num [1:2215] 88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : num [1:2215] 546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : num [1:2215] 172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : num [1:2215] 88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : num [1:2215] 88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : num [1:2215] 3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : num [1:2215] 8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: num [1:2215] 10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : num [1:2215] 4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : num [1:2215] 7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : num [1:2215] 0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num [1:2215] 0 0 0 0 0 0 0 0 0 0 ...

This seems ok now. No real need to rename columns.

view(Data)

Exploratory Data Analysis

plot_str(Data, type = "d")
# Can choose between d for diagonal or r for radial plots

This is a nice plot for seeing the structure of the data. If you have complicated lists of lists this works nicely to see network branches.

introduce(Data)
## # A tibble: 1 x 9
##    rows columns discrete_columns continuous_columns all_missing_columns
##   <int>   <int>            <int>              <int>               <int>
## 1  2215      24                3                 21                   0
## # ... with 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>

This is a quick way to see some basic information - especially the missing values that will get removed in most scenarios.

plot_missing(Data)

There are some missing values in the Income column. Perhaps we will remove them for analysis relating income to other categories. Removed from the data at the start so it should show all 0’s now.

plot_histogram(Data[ , -1], nrow = 2) # -1 because we don't need ID here

plot_density(Data[ , -1], nrow = 2) # We don't need ID here. Note that this is useless for the discrete variables. Less useful than the histograms

Filter out the discrete variables - density isn’t good for a lot of these.

Colourful Correlation Plot

A nice DIY correlation plot that I believe looks more aesthetic than most of those in packages:

# FUll code

# library(plotly)
# library(data.table)

# For the correlation plot, we only want the numeric columns. 
# Here is one way:
#nums <- unlist(lapply(Data_cor), is.numeric)
#Data_cor[ ,nums]

# This one is cool:
#Data_cor[ ,map_lgl(Data_cor, is.numeric)]

# So is this:
#select_if(Data_cor, is.numeric)

# and...:
#Data_cor %>% select(where(is.numeric))


# Data_cor is the dataset (not correlation matrix) you want to use
corrdata <- cor(na.omit(select_if(Data_cor, is.numeric)))
corrdata[upper.tri(corrdata, diag = TRUE)] <- NA
corrdata <- corrdata[-1, -ncol(corrdata)] # take out the first row (no 1-1 correlations)

# Storing variable names for later use
x_labels <- colnames(corrdata)
y_labels <- rownames(corrdata)

# Change variable names to numeric for the grid
colnames(corrdata) <- 1:ncol(corrdata)
rownames(corrdata) <- nrow(corrdata):1

# Melt the data into  the desired format
plotdata <- melt(corrdata)
## Warning in melt(corrdata): The melt generic in data.table has been passed a
## matrix and will attempt to redirect to the relevant reshape2 method; please note
## that reshape2 is deprecated, and this redirection is now deprecated as well.
## To continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(corrdata). In
## the next version, this warning will become an error.
# Adding size variable and scaling it. $value is the correlation value
plotdata$size <- (abs(plotdata$value))
scaling <- 500/ncol(corrdata)/2
plotdata$size <- plotdata$size*scaling 

# Setting x and y ranges for the chart
# We used unit values for initial grid, so shift by 0.5 to create gridlines
xrange <- c(0.5, length(x_labels) + 0.5)
yrange <- c(0.5, length(y_labels) + 0.5)

# Setting the gridlines
x_grid <- seq(1.5, length(x_labels) - 0.5, 1)
y_grid <- seq(1.5, length(x_labels) - 0.5, 1)

# Now some cleanup. Naming variables and removing gridlines

xAx1 <- list(showgrid = FALSE, showline = FALSE, zeroline = FALSE, tickvals = colnames(corrdata), ticktext = x_labels, title = FALSE)

xAx2 <- list(showgrid = FALSE, showline = FALSE, zeroline = FALSE, overlaying = "x", showticklabels = FALSE, range = xrange, tickvals = x_grid)

yAx1 <- list(autoaxis = FALSE, showgrid  = FALSE, showline = FALSE, zeroline = FALSE, tickvals = rownames(corrdata), ticktext = y_labels, title = FALSE)

yAx2 <- list(showgrid = TRUE, showline = FALSE, zeroline = FALSE, overlaying = "y", showticklabels = FALSE, range = yrange, tickvals = y_grid)

fig <- plot_ly(data = plotdata, width = 500, height = 500)

fig <- fig %>% add_trace(x = ~Var2, y = ~Var1, type = "scatter", mode = "markers",
                        color = ~value,
                        marker = list(size = ~size, opacity = 1),
                        symbol = I("square"),
                        text = ~value,
                        hovertemplate = "%{text:.2f} <extra></extra>",
                        xaxis = "x1",
                        yaxis = "y1")

fig <- fig %>% add_trace(x = ~Var2, y = ~Var1, type = "scatter", mode = "markers",
                        opacity = 0,
                        showlegend = FALSE,
                        xaxis = "x2",
                        yaxis = "y2",
                        hoverinfo = "none")

fig <- fig %>% layout(xaxis = xAx1,
                     yaxis = yAx1, 
                     xaxis2 = xAx2,
                     yaxis2 = yAx2,
                     plot_bgcolor = "rgba(0,0,0,0)",
                     paper_bgcolor = "rgba(0, 0, 0, 0.03)")

fig <- fig %>% colorbar(title = "", limits = c(-1,1), x = 1.1, y = 0.75)
fig

Note to self: Fill in or remove the NA’s otherwise you’ll get an NA in your correlation matrix. Some interesting relationships for income are that the number of web visits and number of kids at home is negatively correlated to income. Perhaps consumers who are wealthier and can afford the extra delivery fees are ordering online. There are positive relations to the amount of wines, fruits, meat products, sweets, catalogue, and store purchases, which intuitively makes sense.

Bar Chart

plot_bar(Data) # Note that this ignores columns with > 50 categories. Such as the date column and income.
## 1 columns ignored with more than 50 categories.
## Dt_Customer: 662 categories

It appears the number of people using coupons is around 200 per campaign, so a little less than 10% of persons recorded. These however are quite ugly so I’ll try make something nicer using ggplot directly.

Data %>% 
  group_by(Marital_Status) %>% 
  summarize(Income = mean(Income, na.rm = TRUE)) %>% 
  ggplot(aes(x = reorder(Marital_Status, Income), y = Income)) +
  geom_bar(stat = "identity", aes(fill = Income)) +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "Income Level")

# lots of colour options I could do here, but blue looks aight
# reorder so it goes from high to low

I suspect that YOLO, Alone, Absurd are answers given by providing “Other” responses from the survey. They should be removed from the plot and further analysis.

Data %>% 
  filter(!(Marital_Status == "Alone" | Marital_Status == "YOLO" | Marital_Status == "Absurd")) %>% 
  group_by(Marital_Status) %>% 
  summarize(Income = mean(Income, na.rm = TRUE)) %>% 
  ggplot(aes(x = reorder(Marital_Status, Income), y = Income)) +
  geom_bar(stat = "identity", aes(fill = Income)) +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "Income Level") +
  labs(title = "Average Income for Customers based on Marital Status", x = "Marital Status", y = "Income")

### Violin Plot

plot_scatterplot(Data_cor, by = "Income")

There is one data point that is unusally high for income. Let’s investigate it:

Data %>% slice_max(Income, n = 10)
## # A tibble: 10 x 24
##    Year_Birth Education  Marital_Status Income Kidhome Teenhome Dt_Customer
##         <dbl> <chr>      <chr>           <dbl>   <dbl>    <dbl> <date>     
##  1       1976 PhD        Together       162397       1        1 2013-06-03 
##  2       1982 PhD        Married        160803       0        0 2012-08-04 
##  3       1971 Master     Together       157733       1        0 2013-06-04 
##  4       1973 PhD        Married        157243       0        1 2014-03-01 
##  5       1977 Graduation Together       157146       0        0 2013-04-29 
##  6       1949 PhD        Married        156924       0        0 2013-08-29 
##  7       1975 Graduation Divorced       153924       0        0 2014-02-07 
##  8       1945 PhD        Single         113734       0        0 2014-05-28 
##  9       1970 Graduation Together       105471       0        0 2013-01-21 
## 10       1974 Graduation Divorced       102692       0        0 2013-04-05 
## # ... with 17 more variables: Recency <dbl>, MntWines <dbl>, MntFruits <dbl>,
## #   MntMeatProducts <dbl>, MntFishProducts <dbl>, MntSweetProducts <dbl>,
## #   MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## #   NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## #   NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## #   AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>
Data <- Data %>% 
  filter(Income != 666666)

The largest number is 666,666! Seeing it is such a big outlier and how it’s entered removing it from further analysis

Data <- Data %>% 
  filter(Income != 666666)

Let’s have a look at the scatterplots now:

plot_scatterplot(Data, by = "Income")

There seem to be a couple of outliers who earn very high income, but do not purchase much of anything. Otherwise there seems to be a (expected) linear increase between number of store purchases and income.

Honestly these scatterplots are a bit ugly so let’s make something nicer: a violin plot.

# install.packages("ggbeeswarm")
library(ggbeeswarm)
library(ggpubr)
Data %>% 
  filter(!(Marital_Status == "Alone" | Marital_Status == "YOLO" | Marital_Status == "Absurd")) %>% 
  ggplot(aes(x = Marital_Status, y = Income, fill = Marital_Status)) +
  scale_fill_viridis_d(option = "D") +
  geom_violin(alpha = 0.5, position = position_dodge(width = 0.75), size = 1, colour = NA) +
  geom_boxplot(notch = TRUE, outlier.size = -1, colour = "black", lwd = 1, alpha = 0.7, show.legend = F) + 
  geom_point(shape = 21, size = 1, position = position_jitterdodge(), colour="black",alpha=1) +
  theme_pubr() + # moves legend to top and removes some axis lines. This has to be used early
  ggbeeswarm::geom_quasirandom(shape = 21, size = 1, dodge.width = 0.75, colour = "black", alpha = 0.5, show.legend = F) +
  theme_minimal() +
  ylab(c("Income of Customer")) +
  xlab(c("Marital Status of Customer")) + 

  theme(panel.border = element_rect(colour = "gray", fill = NA, size = 2),
        axis.line = element_line(colour = "black", size = 1),
        axis.ticks = element_line(colour = "black"),
        axis.text = element_text(colour = "black"),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
        ) +
  font("xylab", size = 15) +
  font("xy", size = 15) +
  font("xy.text", size =  15) +
  font("legend.text", size = 15) +
  rremove("legend.title") +# removes Marital_status on legend
  guides(fill = guide_legend(override.aes = list(alpha = 1, colour = "black")))  # Choose any colour you want

plot_histogram(Data_cor,nrow = 2, ncol = 2)

# Have some sort of matrix for these because otherwise it is too messy